In [2]:
# GENDER AGENDAS MAPPER
# V3 - July 2025
# Developed by the Gender Justice Data Hub
# CC BY-NC-SA 4.0, Global Fund for Women
In [4]:
# Uninstall everything related
!pip uninstall -y torch torchvision torchaudio transformers sentence-transformers bertopic umap-learn hdbscan accelerate bitsandbytes xformers
# Clear pip cache
!pip cache purge
# First install PyTorch with CUDA
!pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
# Then the base dependencies
!pip install transformers==4.35.2 accelerate bitsandbytes
# Install sentence-transformers before BERTopic
!pip install sentence-transformers
# Finally install BERTopic and its dependencies
!pip install bertopic umap-learn hdbscan adjustText
Found existing installation: torch 2.2.2 Uninstalling torch-2.2.2: Successfully uninstalled torch-2.2.2 WARNING: Skipping torchvision as it is not installed. WARNING: Skipping torchaudio as it is not installed. Found existing installation: transformers 4.53.2 Uninstalling transformers-4.53.2: Successfully uninstalled transformers-4.53.2 Found existing installation: sentence-transformers 5.0.0 Uninstalling sentence-transformers-5.0.0: Successfully uninstalled sentence-transformers-5.0.0 Found existing installation: bertopic 0.17.3 Uninstalling bertopic-0.17.3: Successfully uninstalled bertopic-0.17.3 Found existing installation: umap-learn 0.5.9.post2 Uninstalling umap-learn-0.5.9.post2: Successfully uninstalled umap-learn-0.5.9.post2 Found existing installation: hdbscan 0.8.40 Uninstalling hdbscan-0.8.40: Successfully uninstalled hdbscan-0.8.40 Found existing installation: accelerate 1.8.1 Uninstalling accelerate-1.8.1: Successfully uninstalled accelerate-1.8.1 Found existing installation: bitsandbytes 0.42.0 Uninstalling bitsandbytes-0.42.0: Successfully uninstalled bitsandbytes-0.42.0 WARNING: Skipping xformers as it is not installed. Files removed: 68 Looking in indexes: https://download.pytorch.org/whl/cu124 ERROR: Could not find a version that satisfies the requirement torch==2.6.0 (from versions: none) ERROR: No matching distribution found for torch==2.6.0 Collecting transformers==4.35.2 Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 123.5/123.5 kB 4.1 MB/s eta 0:00:00 Collecting accelerate Downloading accelerate-1.9.0-py3-none-any.whl.metadata (19 kB) Collecting bitsandbytes Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB) Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (3.13.1) Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (0.33.4) Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (1.26.4) Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (23.1) Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (6.0.1) Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (2023.10.3) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (2.32.3) Collecting tokenizers<0.19,>=0.14 (from transformers==4.35.2) Downloading tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl.metadata (6.7 kB) Requirement already satisfied: safetensors>=0.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (0.5.2) Requirement already satisfied: tqdm>=4.27 in /opt/anaconda3/lib/python3.11/site-packages (from transformers==4.35.2) (4.65.0) Requirement already satisfied: psutil in /opt/anaconda3/lib/python3.11/site-packages (from accelerate) (5.9.0) Collecting torch>=2.0.0 (from accelerate) Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl.metadata (25 kB) Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from bitsandbytes) (1.11.4) Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (2023.6.0) Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (4.14.0) Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub<1.0,>=0.16.4->transformers==4.35.2) (1.1.5) Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (1.12) Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (3.1) Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=2.0.0->accelerate) (3.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->transformers==4.35.2) (2025.4.26) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=2.0.0->accelerate) (2.1.3) Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=2.0.0->accelerate) (1.3.0) Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.9/7.9 MB 11.3 MB/s eta 0:00:0000:0100:01 Downloading accelerate-1.9.0-py3-none-any.whl (367 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 367.1/367.1 kB 7.9 MB/s eta 0:00:00:00:01 Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 105.0/105.0 MB 5.5 MB/s eta 0:00:0000:0100:01 Downloading tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl (2.6 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.6/2.6 MB 3.3 MB/s eta 0:00:00a 0:00:01 Downloading torch-2.2.2-cp311-none-macosx_10_9_x86_64.whl (150.8 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.8/150.8 MB 5.2 MB/s eta 0:00:0000:0100:01 Installing collected packages: torch, bitsandbytes, tokenizers, accelerate, transformers Attempting uninstall: tokenizers Found existing installation: tokenizers 0.21.2 Uninstalling tokenizers-0.21.2: Successfully uninstalled tokenizers-0.21.2 Successfully installed accelerate-1.9.0 bitsandbytes-0.42.0 tokenizers-0.15.2 torch-2.2.2 transformers-4.35.2 Collecting sentence-transformers Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB) Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers) Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.9/40.9 kB 1.3 MB/s eta 0:00:00 Requirement already satisfied: tqdm in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.65.0) Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (2.2.2) Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.6.1) Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.11.4) Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (0.33.4) Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (10.2.0) Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.14.0) Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.13.1) Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2023.6.0) Requirement already satisfied: packaging>=20.9 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (23.1) Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.1) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3) Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (1.1.5) Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (1.12) Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1) Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.3) Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (1.26.4) Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2023.10.3) Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers) Downloading tokenizers-0.21.2-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.8 kB) Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.5.2) Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (3.5.0) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2025.4.26) Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0) Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 470.2/470.2 kB 7.1 MB/s eta 0:00:00a 0:00:01 Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.8/10.8 MB 10.5 MB/s eta 0:00:0000:010:01 Downloading tokenizers-0.21.2-cp39-abi3-macosx_10_12_x86_64.whl (2.9 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.9/2.9 MB 11.2 MB/s eta 0:00:0000:0100:01 Installing collected packages: tokenizers, transformers, sentence-transformers Attempting uninstall: tokenizers Found existing installation: tokenizers 0.15.2 Uninstalling tokenizers-0.15.2: Successfully uninstalled tokenizers-0.15.2 Attempting uninstall: transformers Found existing installation: transformers 4.35.2 Uninstalling transformers-4.35.2: Successfully uninstalled transformers-4.35.2 Successfully installed sentence-transformers-5.0.0 tokenizers-0.21.2 transformers-4.53.2 Collecting bertopic Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB) Collecting umap-learn Downloading umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB) Collecting hdbscan Downloading hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl.metadata (15 kB) Requirement already satisfied: adjustText in /opt/anaconda3/lib/python3.11/site-packages (1.3.0) Requirement already satisfied: numpy>=1.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (1.26.4) Requirement already satisfied: pandas>=1.1.5 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (2.1.4) Requirement already satisfied: plotly>=4.7.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (5.9.0) Requirement already satisfied: scikit-learn>=1.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (1.6.1) Requirement already satisfied: sentence-transformers>=0.4.1 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (5.0.0) Requirement already satisfied: tqdm>=4.41.1 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (4.65.0) Requirement already satisfied: llvmlite>0.36.0 in /opt/anaconda3/lib/python3.11/site-packages (from bertopic) (0.42.0) Requirement already satisfied: scipy>=1.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (1.11.4) Requirement already satisfied: numba>=0.51.2 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (0.59.0) Requirement already satisfied: pynndescent>=0.5 in /opt/anaconda3/lib/python3.11/site-packages (from umap-learn) (0.5.13) Requirement already satisfied: joblib>=1.0 in /opt/anaconda3/lib/python3.11/site-packages (from hdbscan) (1.2.0) Requirement already satisfied: matplotlib in /opt/anaconda3/lib/python3.11/site-packages (from adjustText) (3.8.0) Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in /opt/anaconda3/lib/python3.11/site-packages (from pandas>=1.1.5->bertopic) (2023.3) Requirement already satisfied: tenacity>=6.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from plotly>=4.7.0->bertopic) (8.2.2) Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn>=1.0->bertopic) (3.5.0) Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (4.53.2) Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (2.2.2) Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (0.33.4) Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (10.2.0) Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers>=0.4.1->bertopic) (4.14.0) Requirement already satisfied: contourpy>=1.0.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (1.2.0) Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (1.4.4) Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (23.1) Requirement already satisfied: pyparsing>=2.3.1 in /opt/anaconda3/lib/python3.11/site-packages (from matplotlib->adjustText) (3.0.9) Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (3.13.1) Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2023.6.0) Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (6.0.1) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.32.3) Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (1.1.5) Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas>=1.1.5->bertopic) (1.16.0) Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (1.12) Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (3.1) Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (3.1.3) Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (2023.10.3) Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (0.21.2) Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers>=0.4.1->bertopic) (0.5.2) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (2.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers>=0.4.1->bertopic) (2025.4.26) Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers>=0.4.1->bertopic) (1.3.0) Downloading bertopic-0.17.3-py3-none-any.whl (153 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 153.0/153.0 kB 5.3 MB/s eta 0:00:00 Downloading umap_learn-0.5.9.post2-py3-none-any.whl (90 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 90.1/90.1 kB 8.6 MB/s eta 0:00:00 Downloading hdbscan-0.8.40-cp311-cp311-macosx_10_9_universal2.whl (1.5 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 3.5 MB/s eta 0:00:00a 0:00:01m Installing collected packages: hdbscan, umap-learn, bertopic Successfully installed bertopic-0.17.3 hdbscan-0.8.40 umap-learn-0.5.9.post2
In [5]:
!pip install openai --upgrade
Requirement already satisfied: openai in /opt/anaconda3/lib/python3.11/site-packages (1.96.0) Collecting openai Downloading openai-1.97.0-py3-none-any.whl.metadata (29 kB) Requirement already satisfied: anyio<5,>=3.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.2.0) Requirement already satisfied: distro<2,>=1.7.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (1.8.0) Requirement already satisfied: httpx<1,>=0.23.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (0.28.1) Requirement already satisfied: jiter<1,>=0.4.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (0.10.0) Requirement already satisfied: pydantic<3,>=1.9.0 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (2.8.2) Requirement already satisfied: sniffio in /opt/anaconda3/lib/python3.11/site-packages (from openai) (1.3.0) Requirement already satisfied: tqdm>4 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.65.0) Requirement already satisfied: typing-extensions<5,>=4.11 in /opt/anaconda3/lib/python3.11/site-packages (from openai) (4.14.0) Requirement already satisfied: idna>=2.8 in /opt/anaconda3/lib/python3.11/site-packages (from anyio<5,>=3.5.0->openai) (3.10) Requirement already satisfied: certifi in /opt/anaconda3/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (2025.4.26) Requirement already satisfied: httpcore==1.* in /opt/anaconda3/lib/python3.11/site-packages (from httpx<1,>=0.23.0->openai) (1.0.7) Requirement already satisfied: h11<0.15,>=0.13 in /opt/anaconda3/lib/python3.11/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai) (0.14.0) Requirement already satisfied: annotated-types>=0.4.0 in /opt/anaconda3/lib/python3.11/site-packages (from pydantic<3,>=1.9.0->openai) (0.6.0) Requirement already satisfied: pydantic-core==2.20.1 in /opt/anaconda3/lib/python3.11/site-packages (from pydantic<3,>=1.9.0->openai) (2.20.1) Downloading openai-1.97.0-py3-none-any.whl (764 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 765.0/765.0 kB 8.8 MB/s eta 0:00:0000:0100:01 Installing collected packages: openai Attempting uninstall: openai Found existing installation: openai 1.96.0 Uninstalling openai-1.96.0: Successfully uninstalled openai-1.96.0 Successfully installed openai-1.97.0
In [8]:
!pip install polars-lts-cpu
Requirement already satisfied: polars-lts-cpu in /opt/anaconda3/lib/python3.11/site-packages (1.31.0)
In [10]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
from bertopic import BERTopic
print("BERTopic imported successfully")
from sentence_transformers import SentenceTransformer
print("SentenceTransformers is working")
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
import openai
PyTorch version: 2.2.2 CUDA available: False
<frozen importlib._bootstrap>:241: RuntimeWarning: pyarrow.lib.IpcReadOptions size changed, may indicate binary incompatibility. Expected 96 from C header, got 104 from PyObject
BERTopic imported successfully SentenceTransformers is working
In [11]:
import pandas as pd
import re
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from bertopic import BERTopic
from bertopic.representation import OpenAI, KeyBERTInspired, MaximalMarginalRelevance
# Load API key from .env
from dotenv import load_dotenv
import os
import openai
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
print("Packages loaded successfully.")
Packages loaded successfully.
[nltk_data] Downloading package stopwords to /Users/Condi/nltk_data... [nltk_data] Package stopwords is already up-to-date!
In [16]:
import openai
from dotenv import load_dotenv
import os
# Load API key from .env file
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Create OpenAI client for v1.x
client = openai.OpenAI(api_key=api_key)
# Load topic representation
from bertopic.representation import OpenAI as OpenAI_Representation
representation_model = OpenAI_Representation(
client=client,
model="gpt-4o",
delay_in_seconds=10
)
# Prompt
prompt = """
I have a topic that contains the following documents: [DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
Based on the information above, extract a short topic label in the following format:
topic: <topic label>
"""
In [20]:
import pandas as pd
file_list = [
"Input/gender_Apr25-1.csv"
]
# Read and convert date without filtering
dfs = []
for f in file_list:
df_temp = pd.read_csv(f)
df_temp["event_date"] = pd.to_datetime(df_temp["event_date"])
dfs.append(df_temp)
# Concatenate all DataFrames
df = pd.concat(dfs, ignore_index=True)
In [22]:
print("Number of rows after filtering:", len(df))
print("Dates:", df["event_date"].min(), "→", df["event_date"].max())
Number of rows after filtering: 84118 Dates: 1997-01-06 00:00:00 → 2025-04-25 00:00:00
In [24]:
import polars as pl
pl_df = pl.from_pandas(df)
pl_df
Out[24]:
shape: (84_118, 31)
| event_id_cnty | event_date | year | time_precision | disorder_type | event_type | sub_event_type | actor1 | assoc_actor_1 | inter1 | actor2 | assoc_actor_2 | inter2 | interaction | civilian_targeting | iso | region | country | admin1 | admin2 | admin3 | location | latitude | longitude | geo_precision | source | source_scale | notes | fatalities | tags | timestamp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | datetime[ns] | i64 | i64 | str | str | str | str | str | str | str | str | str | str | str | i64 | str | str | str | str | str | str | f64 | f64 | i64 | str | str | str | i64 | str | i64 |
| "ARG16601" | 2025-04-25 00:00:00 | 2025 | 1 | "Demonstrations" | "Protests" | "Peaceful protest" | "Protesters (Argentina)" | "Women (Argentina)" | "Protesters" | null | null | null | "Protesters only" | null | 32 | "South America" | "Argentina" | "Cordoba" | "Punilla" | null | "Capilla del Monte" | -30.8568 | -64.5258 | 1 | "El Diario de Carlos Paz" | "Subnational" | "On 25 April 2025, in Capilla d… | 0 | "crowd size=large" | 1745881584 |
| "BRA96908" | 2025-04-25 00:00:00 | 2025 | 2 | "Political violence" | "Violence against civilians" | "Attack" | "CV: Red Command" | null | "Political militia" | "Civilians (Brazil)" | "Women (Brazil)" | "Civilians" | "Political militia-Civilians" | "Civilian targeting" | 76 | "South America" | "Brazil" | "Bahia" | "Salvador" | null | "Salvador" | -12.9711 | -38.5108 | 1 | "Alo Juca; Bnews (Brazil)" | "Subnational-National" | "Around 25 April 2025 (as repor… | 1 | "women targeted: girls" | 1745881585 |
| "ISR45719" | 2025-04-25 00:00:00 | 2025 | 1 | "Demonstrations" | "Protests" | "Peaceful protest" | "Protesters (Israel)" | "Shift 101; Women (Israel)" | "Protesters" | null | null | null | "Protesters only" | null | 376 | "Middle East" | "Israel" | "Jerusalem" | "Jerusalem" | "Judean Mountains" | "Jerusalem" | 31.769 | 35.2163 | 1 | "Haaretz" | "National" | "On 25 April 2025, about 200 Is… | 0 | "crowd size=about 200" | 1745881590 |
| "MEX103000" | 2025-04-25 00:00:00 | 2025 | 1 | "Political violence" | "Violence against civilians" | "Attack" | "Unidentified Armed Group (Mexi… | null | "Political militia" | "Civilians (Mexico)" | "Labor Group (Mexico); Women (M… | "Civilians" | "Political militia-Civilians" | "Civilian targeting" | 484 | "North America" | "Mexico" | "Guanajuato" | "Leon" | null | "Leon de los Aldama" | 21.122 | -101.6832 | 1 | "Zona Franca" | "Subnational" | "On 25 April 2025, in Leon de l… | 1 | null | 1745881592 |
| "MEX103223" | 2025-04-25 00:00:00 | 2025 | 1 | "Political violence" | "Violence against civilians" | "Attack" | "Unidentified Gang (Mexico)" | null | "Political militia" | "Civilians (Mexico)" | "Women (Mexico)" | "Civilians" | "Political militia-Civilians" | "Civilian targeting" | 484 | "North America" | "Mexico" | "Veracruz de Ignacio de la Llav… | "Coxquihui" | null | "Sabanas de Xalostoc" | 20.2216 | -97.5349 | 1 | "Imagen del Golfo" | "Subnational" | "On 25 April 2025, in Sabanas d… | 2 | "women targeted: relatives of t… | 1745881593 |
| … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
| "UGA13" | 1997-03-05 00:00:00 | 1997 | 2 | "Political violence" | "Violence against civilians" | "Attack" | "LRA: Lords Resistance Army" | null | "Rebel group" | "Civilians (Uganda)" | "Women (Uganda)" | "Civilians" | "Rebel group-Civilians" | "Civilian targeting" | 800 | "Eastern Africa" | "Uganda" | "Lamwo" | "Lamwo" | "Palabek Kal" | "Palabek" | 3.4333 | 32.5667 | 1 | "New York Times" | "International" | "After failing to find deserter… | 9 | "women targeted: girls" | 1667868656 |
| "ALG50" | 1997-02-24 00:00:00 | 1997 | 1 | "Political violence" | "Violence against civilians" | "Attack" | "GIA: Armed Islamic Group" | null | "Rebel group" | "Civilians (Algeria)" | "Women (Algeria)" | "Civilians" | "Rebel group-Civilians" | "Civilian targeting" | 12 | "Northern Africa" | "Algeria" | "Medea" | "Berrouaghia" | null | "Berrouaghia" | 36.1352 | 2.9109 | 1 | "Algeria Watch" | "Other" | "24 March: 5 young girls were k… | 5 | "women targeted: girls" | 1638981224 |
| "SIE4762" | 1997-01-22 00:00:00 | 1997 | 2 | "Political violence" | "Violence against civilians" | "Sexual violence" | "RUF: Revolutionary United Fron… | null | "Rebel group" | "Civilians (Sierra Leone)" | "Women (Sierra Leone)" | "Civilians" | "Rebel group-Civilians" | "Civilian targeting" | 694 | "Western Africa" | "Sierra Leone" | "Northern" | "Tonkolili" | "Kholifa Rowalla" | "Magburaka" | 8.7167 | -11.95 | 2 | "AFP" | "International" | "Week of 22 January. RUF forces… | 40 | "women targeted: girls" | 1638981224 |
| "RWA652" | 1997-01-15 00:00:00 | 1997 | 3 | "Political violence" | "Violence against civilians" | "Attack" | "Unidentified Armed Group (Rwan… | null | "Political militia" | "Civilians (Spain)" | "Aid Workers (Spain); Women (Sp… | "Civilians" | "Political militia-Civilians" | "Civilian targeting" | 646 | "Eastern Africa" | "Rwanda" | "North" | "Musanze" | "Cyuve" | "Ruhengeri" | -1.4998 | 29.635 | 1 | "Aid Worker Security Database" | "Local partner-Other" | "Around 15 January 1997 (month … | 3 | null | 1633983690 |
| "NIR1" | 1997-01-06 00:00:00 | 1997 | 1 | "Political violence" | "Violence against civilians" | "Attack" | "Tuareg Ethnic Militia (Niger)" | null | "Identity militia" | "Civilians (Niger)" | "Women (Niger)" | "Civilians" | "Identity militia-Civilians" | "Civilian targeting" | 562 | "Western Africa" | "Niger" | "Niamey" | "Ville de Niamey" | "Niamey III" | "Niamey" | 13.52 | 2.12 | 1 | "Reuters" | "International" | "A french woman was shot and ki… | 1 | null | 1622068223 |
In [26]:
import polars as pl
import pandas as pd
import re
# Get unique list of places and create the pattern
place_cols = ["country", "location", "admin1", "admin2", "admin3"]
place_series = [pl_df[col].drop_nulls().unique() for col in place_cols]
places = set()
for s in place_series:
places.update(s.to_list())
places = [l.strip() for l in places if isinstance(l, str)]
pattern = r'\b(' + '|'.join(map(re.escape, places)) + r')\b'
# Apply cleaning directly in Polars
pl_df_clean = pl_df.with_columns(
pl.col("notes")
.cast(pl.String)
.str.replace_all(pattern, "")
.str.replace_all(r"\b(19|20)\d{2}\b", "")
.alias("notes_clean")
)
# Extract the final list
documents = pl_df_clean["notes_clean"].drop_nulls().to_list()
In [28]:
titles = df["event_id_cnty"].dropna().tolist()
In [30]:
import pandas as pd
import polars as pl
def glimpse_polars(df: pl.DataFrame, max_cols=100, max_rows=5):
print(f"Observations: {df.height:,}")
print(f"Variables: {df.width:,}")
print("-" * 100)
col_info = []
for i, col_name in enumerate(df.columns):
if i < max_cols:
col_series = df.get_column(col_name)
dtype = col_series.dtype
non_null_count = col_series.len() - col_series.null_count()
unique_count = col_series.n_unique()
sample_values = col_series.drop_nulls().unique().slice(0, max_rows).to_list()
col_info.append({
"Variable": col_name,
"Type": dtype,
"Non-Null": f"{non_null_count:,}",
"Unique": f"{unique_count:,}",
"Sample Values": sample_values
})
col_info_df = pd.DataFrame(col_info)
print(col_info_df.to_string(index=False, max_colwidth=100))
if df.width > max_cols:
print(f"\n... and {df.width - max_cols} more variables")
print("-" * 100)
glimpse_polars(pl_df_clean)
Observations: 84,118
Variables: 32
----------------------------------------------------------------------------------------------------
Variable Type Non-Null Unique Sample Values
event_id_cnty String 84,118 84,118 [SWE6397, ARG3600, FRA20469, CAN3752, ITA14805]
event_date Datetime(time_unit='ns', time_zone=None) 84,118 5,331 [1997-01-06 00:00:00, 1997-01-15 00:00:00, 1997-01-22 00:00:00, 1997-02-24 00:00:00, 1997-03-05 0...
year Int64 84,118 29 [1997, 1998, 1999, 2000, 2001]
time_precision Int64 84,118 3 [1, 2, 3]
disorder_type String 84,118 4 [Strategic developments, Political violence, Demonstrations, Political violence; Demonstrations]
event_type String 84,118 5 [Violence against civilians, Riots, Explosions/Remote violence, Strategic developments, Protests]
sub_event_type String 84,118 17 [Disrupted weapons use, Sexual violence, Protest with intervention, Remote explosive/landmine/IED...
actor1 String 84,118 2,036 [Kashmir Rebels (India), Government of Kyrgyzstan (2017-2020), Rioters (Mauritius), Islamic State...
assoc_actor_1 String 62,034 12,807 [The Korean Council; Buddhist Group (South Korea); Students (South Korea); Women (South Korea), A...
inter1 String 84,118 8 [Protesters, Political militia, Rebel group, External/Other forces, State forces]
actor2 String 29,780 612 [Military Forces of Lebanon (2019-2020), Rioters (Guatemala), Police Forces of Guinea (2008-2010)...
assoc_actor_2 String 26,441 3,837 [Government of South Korea (2022-); Liberal Unification Party; PPP: People Power Party; Protestan...
inter2 String 29,780 9 [Protesters, State forces, Rebel group, Civilians, Rioters]
interaction String 84,118 23 [State forces-Rioters, Rioters-Protesters, State forces-Civilians, Rioters-Rioters, Identity mili...
civilian_targeting String 23,658 2 [Civilian targeting]
iso Int64 84,118 196 [0, 4, 8, 12, 20]
region String 84,118 16 [Eastern Africa, Caribbean, East Asia, Middle East, South Asia]
country String 84,118 196 [Luxembourg, Papua New Guinea, Sint Maarten, Germany, Egypt]
admin1 String 84,118 2,191 [Jablanica, Sint Maarten, Bogota, D.C., Imereti, Lusaka]
admin2 String 82,163 9,938 [Moro, Dalaho, Alcobaca, Chapulhuacan, Nueces]
admin3 String 32,852 6,400 [Juong Kang, City of Tshwane, Bafoussam 1, Rohat, Sakinyonga]
location String 84,118 20,553 [Tokyo, Guerrero Negro, Uruma, Qariyah, Colonie]
latitude Float64 84,118 20,919 [-54.8062, -54.5119, -53.7865, -53.1548, -51.7308]
longitude Float64 84,118 21,104 [-171.7553, -161.7558, -159.7804, -159.3721, -158.4575]
geo_precision Int64 84,118 3 [1, 2, 3]
source String 84,118 15,733 [El Tiempo (Colombia); La Opinion (Colombia), Melilla Hoy, Spectrum News Charlotte; Queen City Ne...
source_scale String 84,118 26 [Local partner-International, New media-National, Subnational, Local partner-National, Subnationa...
notes String 84,118 83,631 [On 8 March 2021, over 100 citizens called by the Feminist Action (AFdA) protested in Seu d'Urgel...
fatalities Int64 84,118 55 [0, 1, 2, 3, 4]
tags String 63,623 2,702 [crowd size=an estimated 200, crowd size=2,000-15,000, crowd size=around 300; local administrator...
timestamp Int64 84,118 16,192 [1559160369, 1559160524, 1559160525, 1559160526, 1559160527]
notes_clean String 84,118 78,463 [On 14 May , two men approached a same-sex couple, used anti-LGBTQ+ slur, punched one of the wome...
----------------------------------------------------------------------------------------------------
In [32]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI
# Embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=True)
# UMAP for dimensionality reduction (5D for clustering)
umap_model = UMAP(
n_neighbors=40,
n_components=5,
min_dist=0.4,
metric='cosine',
random_state=42
)
# HDBSCAN for clustering
hdbscan_model = HDBSCAN(
min_cluster_size=40,
min_samples=15,
metric='euclidean',
cluster_selection_method='eom',
prediction_data=True
)
# Dimensionality reduction for visualization (2D)
reduced_embeddings = UMAP(
n_neighbors=15,
n_components=2,
min_dist=0.0,
metric='cosine',
random_state=42
).fit_transform(embeddings)
# Stopwords and vectorization
stopwords_total = set(stopwords.words("spanish") + stopwords.words("english"))
vectorizer_model = CountVectorizer(stop_words=list(stopwords_total))
# Semantic representations
representation_model = {
"KeyBERT": KeyBERTInspired(),
"MMR": MaximalMarginalRelevance(diversity=0.3),
"OpenAI": OpenAI(
client=client,
model="gpt-4o",
prompt=prompt
)
}
Batches: 0%| | 0/2629 [00:00<?, ?it/s]
In [34]:
topic_model = BERTopic(
embedding_model=embedding_model,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
vectorizer_model=vectorizer_model,
representation_model=representation_model,
top_n_words=10,
verbose=True
)
# Fit the model and transform the documents
topics, probs = topic_model.fit_transform(documents, embeddings)
2025-07-16 14:45:57,349 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm 2025-07-16 14:47:41,683 - BERTopic - Dimensionality - Completed ✓ 2025-07-16 14:47:41,684 - BERTopic - Cluster - Start clustering the reduced embeddings huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) 2025-07-16 14:47:46,087 - BERTopic - Cluster - Completed ✓ 2025-07-16 14:47:46,098 - BERTopic - Representation - Fine-tuning topics using representation models. 100%|██████████| 266/266 [03:38<00:00, 1.22it/s] 2025-07-16 14:51:56,562 - BERTopic - Representation - Completed ✓
In [38]:
# Basic topic information
print(topic_model.get_topic_info())
# Keywords of a specific topic
print(topic_model.get_topic(0))
# Visualization
topic_model.visualize_topics()
Topic Count Name \
0 -1 40496 -1_women_group_protest_march
1 0 1364 0_found_body_wrapped_fatality
2 1 1152 1_ransom_abducted_kidnapped_unidentified
3 2 1121 2_conference_press_picketed_urged
4 3 1006 3_femicide_justice_case_demand
.. ... ... ...
261 260 40 260_conversions_aurat_marriages_enforced
262 261 40 261_japanese_weekly_apology_wednesday
263 262 40 262_bigger_50th_roe_anniversary
264 263 40 263_autism_children_simeonov_disabilities
265 264 40 264_hijab_religious_sociopolitical_veil
Representation \
0 [women, group, protest, march, members, woman,...
1 [found, body, wrapped, fatality, colonia, tied...
2 [ransom, abducted, kidnapped, unidentified, ar...
3 [conference, press, picketed, urged, korean, s...
4 [femicide, justice, case, demand, friends, fem...
.. ...
261 [conversions, aurat, marriages, enforced, mini...
262 [japanese, weekly, apology, wednesday, comfort...
263 [bigger, 50th, roe, anniversary, coincide, wad...
264 [autism, children, simeonov, disabilities, aut...
265 [hijab, religious, sociopolitical, veil, guida...
KeyBERT \
0 [protested, protesters, protest, demonstrators...
1 [strangled, decapitated, dismembered, beheaded...
2 [kidnapped, kidnappers, kidnapping, abductions...
3 [picketed, pickets, solidarity, discrimination...
4 [protested, protesters, protest, victims, acti...
.. ...
261 [protest, march, wage, wages, demonstration, w...
262 [reconciliation, seoul, participants, embassy,...
263 [protesters, protests, demonstrators, abortion...
264 [protested, disabilities, mothers, children, a...
265 [hijabs, hijab, niqabs, sharia, islamic, polic...
MMR \
0 [group, protest, march, woman, violence, polic...
1 [found, wrapped, fatality, colonia, plastic, t...
2 [ransom, abducted, kidnapped, unidentified, na...
3 [press, picketed, seoul, banners, harassment, ...
4 [justice, cases, victims, feminist, victim, co...
.. ...
261 [marriages, enforced, increase, march, demandi...
262 [japanese, apology, wednesday, comfort, embass...
263 [roe, january, commemorate, 51st, abortion, pr...
264 [autism, disabilities, valeri, insulting, parl...
265 [hijab, sociopolitical, veil, islamic, iranian...
OpenAI \
0 [Protests Against Gender-Based Violence and Wo...
1 [Femicide and Body Disposal Patterns in Urban ...
2 [Kidnappings and Ransom Demands by Armed Groups]
3 [Gender Equality and Anti-Discrimination Prote...
4 [Protests Demanding Justice for Femicide Victims]
.. ...
261 [Women's Rights and Demonstrations in Pakistan]
262 [Comfort Women Protests at Former Japanese Emb...
263 [Nationwide 'Bigger than Roe' Protests on Roe ...
264 [Protests Against Valeri Simeonov over Insulti...
265 [Enforcement of Religious Dress Codes by Irani...
Representative_Docs
0 [On 3 July , community members marched in (, ...
1 [Around 3 April (as reported), in , , a woman...
2 [On 5 March , unidentified armed men abducted ...
3 [On 4 September , protesters, including local ...
4 [On 10 January , in , , about 100 people, incl...
.. ...
261 [On 8 March , women held a demonstration durin...
262 [On 7 February , participants held signs and b...
263 [On 22 January , an unreported number of peopl...
264 [On 28 November , mothers of children with dis...
265 [Other: On 18 June , Iranian Guidance Patrol p...
[266 rows x 8 columns]
[('found', 0.06839050266583661), ('body', 0.0683816689112666), ('wrapped', 0.04236840290668702), ('fatality', 0.04149472584708486), ('colonia', 0.038043925667171755), ('tied', 0.03600635930266989), ('reported', 0.03454916678900737), ('plastic', 0.032110858453526715), ('signs', 0.028939472810125377), ('torture', 0.027406376602467974)]
In [40]:
topic_model.get_topic_info()
Out[40]:
| Topic | Count | Name | Representation | KeyBERT | MMR | OpenAI | Representative_Docs | |
|---|---|---|---|---|---|---|---|---|
| 0 | -1 | 40496 | -1_women_group_protest_march | [women, group, protest, march, members, woman,... | [protested, protesters, protest, demonstrators... | [group, protest, march, woman, violence, polic... | [Protests Against Gender-Based Violence and Wo... | [On 3 July , community members marched in (, ... |
| 1 | 0 | 1364 | 0_found_body_wrapped_fatality | [found, body, wrapped, fatality, colonia, tied... | [strangled, decapitated, dismembered, beheaded... | [found, wrapped, fatality, colonia, plastic, t... | [Femicide and Body Disposal Patterns in Urban ... | [Around 3 April (as reported), in , , a woman... |
| 2 | 1 | 1152 | 1_ransom_abducted_kidnapped_unidentified | [ransom, abducted, kidnapped, unidentified, ar... | [kidnapped, kidnappers, kidnapping, abductions... | [ransom, abducted, kidnapped, unidentified, na... | [Kidnappings and Ransom Demands by Armed Groups] | [On 5 March , unidentified armed men abducted ... |
| 3 | 2 | 1121 | 2_conference_press_picketed_urged | [conference, press, picketed, urged, korean, s... | [picketed, pickets, solidarity, discrimination... | [press, picketed, seoul, banners, harassment, ... | [Gender Equality and Anti-Discrimination Prote... | [On 4 September , protesters, including local ... |
| 4 | 3 | 1006 | 3_femicide_justice_case_demand | [femicide, justice, case, demand, friends, fem... | [protested, protesters, protest, victims, acti... | [justice, cases, victims, feminist, victim, co... | [Protests Demanding Justice for Femicide Victims] | [On 10 January , in , , about 100 people, incl... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 261 | 260 | 40 | 260_conversions_aurat_marriages_enforced | [conversions, aurat, marriages, enforced, mini... | [protest, march, wage, wages, demonstration, w... | [marriages, enforced, increase, march, demandi... | [Women's Rights and Demonstrations in Pakistan] | [On 8 March , women held a demonstration durin... |
| 262 | 261 | 40 | 261_japanese_weekly_apology_wednesday | [japanese, weekly, apology, wednesday, comfort... | [reconciliation, seoul, participants, embassy,... | [japanese, apology, wednesday, comfort, embass... | [Comfort Women Protests at Former Japanese Emb... | [On 7 February , participants held signs and b... |
| 263 | 262 | 40 | 262_bigger_50th_roe_anniversary | [bigger, 50th, roe, anniversary, coincide, wad... | [protesters, protests, demonstrators, abortion... | [roe, january, commemorate, 51st, abortion, pr... | [Nationwide 'Bigger than Roe' Protests on Roe ... | [On 22 January , an unreported number of peopl... |
| 264 | 263 | 40 | 263_autism_children_simeonov_disabilities | [autism, children, simeonov, disabilities, aut... | [protested, disabilities, mothers, children, a... | [autism, disabilities, valeri, insulting, parl... | [Protests Against Valeri Simeonov over Insulti... | [On 28 November , mothers of children with dis... |
| 265 | 264 | 40 | 264_hijab_religious_sociopolitical_veil | [hijab, religious, sociopolitical, veil, guida... | [hijabs, hijab, niqabs, sharia, islamic, polic... | [hijab, sociopolitical, veil, islamic, iranian... | [Enforcement of Religious Dress Codes by Irani... | [Other: On 18 June , Iranian Guidance Patrol p... |
266 rows × 8 columns
In [42]:
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
for topic_id, label_info in openai_topics.items():
label = label_info[0][0].split("\n")[0]
print(f"Topic {topic_id:>2}: {label}")
Topic -1: Protests Against Gender-Based Violence and Women's Rights Advocacy Topic 0: Femicide and Body Disposal Patterns in Urban Areas Topic 1: Kidnappings and Ransom Demands by Armed Groups Topic 2: Gender Equality and Anti-Discrimination Protests in South Korea Topic 3: Protests Demanding Justice for Femicide Victims Topic 4: Protests and Riots Over Mahsa Amini's Death Topic 5: Abortion Rights Protests Following Overturning of Roe v. Wade Topic 6: Anganwadi Workers' Protests for Regularization and Increased Wages Topic 7: Global Demonstrations for Abortion Legalization on September 28 Topic 8: Nationwide Flower Demonstrations in Support of MeToo Movement Against Sexual Violence Topic 9: Suppression of Petitioners and Rights Defenders During Political Conventions in China Topic 10: Political Protests by BJP and BJP Mahila Morcha Against INC and AIMC Actions Topic 11: Sexual Violence in Darfur by Militias and RSF Topic 12: Drive-by Shootings Involving Motorcycles and Female Victims Topic 13: Abductions by ISWAP/Boko Haram Militants Topic 14: Military Arrests and Detentions in Various Regions Topic 15: Meira Paibi Protests Over Arrest of UNLF-P Cadres Amidst Meitei-Tribal Tensions Topic 16: Women's Rights and Anti-Violence Activism by Kvinnostrejk Movement Topic 17: Female Student Protests and College Infrastructure Issues Topic 18: Protests Against Mahsa Amini's Death and Iranian Government Topic 19: Gun Violence Awareness and Advocacy Activities by Moms Demand Action Topic 20: Sexual Violence by Military Forces Topic 21: Comfort Women Protests and Japanese Government Apology Topic 22: Witchcraft-Related Violence and Accusations Topic 23: International Day for the Elimination of Violence against Women Protests Topic 24: Violence Against Transgender Individuals and Hate Crimes Topic 25: Asha Workers Protests for Wage Increase and Regularization in J&K Topic 26: Nationwide Protests Against Transphobia and Senate Bill Topic 27: University Student Protests for Justice for Mahsa Amini Topic 28: Water Supply Protests in J&K Topic 29: Nationwide Protests Against Trump Administration's Policies and Support for Civil Rights Topic 30: Women's Protests Against Military Coup in Support of Civil Disobedience Movement in [Region] Topic 31: Political Violence in West Bengal Elections Topic 32: Protests Against Far-Right Ideologies and AfD Activities Topic 33: Taliban Violations Against Women and Girls Topic 34: Police Violence Against Women Topic 35: Ambazonian Separatists' Violent Acts Against Civilians and Alleged Collaborators Topic 36: Mothers' Day Protest for Missing Persons Justice Topic 37: Protests Against Constitutional Revision of Article 9 in Japan Topic 38: Violent Incidents Involving Armed Individuals Targeting Women in Michoacan Topic 39: Black Lives Matter Protests and Advocacy Against Police Brutality Topic 40: International Women's Day March for Gender Equality and Rights Topic 41: Protests Against Taliban's Women's Rights Violations Topic 42: Houthi-Sponsored Protests Supporting Palestinians Against Israeli Actions Topic 43: Protests Against Isolation Policy of Abdullah Ocalan by Kurdish Groups Topic 44: Kazakh Female Protests for Subsidized Housing and Social Assistance Topic 45: Houthi Sniper Attacks on Civilians in Yemen Topic 46: Global Climate Activism Protests by Extinction Rebellion Topic 47: Lady Health Workers' Salary Protests Topic 48: Israeli Settler Attacks on Palestinians in West Bank Topic 49: Pro-Choice Demonstrations Following Overturn of Roe v. Wade Topic 50: Red Dress Day Awareness for Missing and Murdered Indigenous Women Topic 51: Drug Trafficking-Related Shootings of Women Topic 52: Protests Against Gender-Based Violence in Universities Topic 53: Women's Protests Against Relocation of Liquor Shops Topic 54: Pro-Palestinian Demonstrations Amid Israel-Hamas Conflict Topic 55: Sexual Misconduct by Police Officers Topic 56: Women's Rights Activism and Demonstrations Topic 57: Motorcycle Drive-by Shootings Targeting Women Topic 58: Imbonerakure Violence against Women and Minors Topic 59: Protests Against Gender-Based Violence in November and December Topic 60: Protests Against Citizenship Amendment Act and National Register of Citizens Topic 61: Women's March 2020 Topic 62: International Women's Day Protests for Gender Equality Topic 63: Killings by Unidentified Armed Groups in Kivu and Surrounding Regions Topic 64: Female Homicides Involving Burning of Bodies Topic 65: Ukrainian Protests Against Russian Invasion Topic 66: Women's Rights and Abortion Protests Topic 67: Military Attacks and Arson in Villages Topic 68: Protests Against Sexual Harassment in Wrestling Federation Topic 69: Parent Protests in Elementary Schools Over Administration Issues and Resource Management Topic 70: Political Candidate Attacks and Violence Topic 71: Wartime Sexual Violence and Killings Involving Russian and Ukrainian Forces Topic 72: Hijab Protests and Counter-Protests in Educational Institutions Topic 73: Saturday Mothers' Weekly Protests for Justice Topic 74: Women's Protests and Public Vandalism on International Women's Day Topic 75: Abortion Rights Protests and Counter-Demonstrations Topic 76: Mob Violence Against Women Suspected as Child Lifters Topic 77: Topic: Fulani-Related Violence and Attacks on Civilians Topic 78: Off-Duty Female Police Officer Fatalities by Armed Attackers Topic 79: HDP Protests Against Dismissal of Mayors Topic 80: Protests Against Dismissal of DEM Party Mayor Mehmet Siddik Akis and Trustee Appointment Topic 81: Armed Violence and Fatalities Involving FARC Dissidents and ELN in Rural Areas Topic 82: Protests by Female Street Traders Against Market Expulsions and Relocations Topic 83: Inter-Community Violence and Protest in Meitei and Tribal Areas Topic 84: Caste-Based Sexual Violence Against Dalit Women in Uttar Pradesh Topic 85: Abortion Rights Protests Following Leaked Supreme Court Draft Topic 86: International Protest Against Gender-Based Violence Topic 87: Nationwide Flower Demonstrations Against Sexual Violence Acquittals Topic 88: Protests Against Netanyahu's Judicial Overhaul Topic 89: International Women's Day Demonstrations and Women's Issues Topic 90: Women's Protests Against Energy Shortages in KPK Topic 91: Protests by Swedish-Iranian Community against Death of Kurdish Woman in Police Custody Topic 92: Student Protests Against School Handling of Sexual Assault and Misconduct Topic 93: Protests Against Rape and Murder of Woman Veterinary Doctor Topic 94: Protests Against Supreme Court's Dobbs Decision on Abortion Rights Topic 95: Protests Against Violence Towards Women Topic 96: Protests Against Macron's Appointment of Michel Barnier as Prime Minister Topic 97: Protests Against Violence and Rape in Bangladesh Topic 98: Gang-Related Violence Against Women Topic 99: Female Textile Workers' Protests for Unpaid Wages Topic 100: Women's March Protesting Amy Coney Barrett's Supreme Court Nomination Topic 101: Women's Strike Movement Protesting Abortion Law Restrictions Topic 102: Women's Protests for Political Prisoners' Immediate Release in Bahrain Topic 103: Military Police Officers' Wives Protesting Overdue Salaries Topic 104: Violence and Human Rights Abuses by Mayi-Mayi Militias in Kivu Region Topic 105: Women's Rights Demonstrations in Dalarna Topic 106: University Fraternity Sexual Assault Protests Topic 107: Women's Rights Activists Protest for Hostage Release from Hamas Topic 108: Protests Against Agricultural Ordinances and Debt Waivers for Farmers Topic 109: Protests Against Turkish Military Operations in Kurdish Regions Topic 110: Boeung Kak and Borei Keila Land Disputes and Protests in Cambodia Topic 111: Women's Rally Against Abortion Restrictions Topic 112: Supreme Court and Abortion Legislation Protests Topic 113: Alleged Poisoning and Mass Sociogenic Illness in Schools Topic 114: Women's Protests Against Abortion Restrictions Topic 115: Protests Against Bolsonaro's Presidency and Gender-Based Violence in Brazil Topic 116: Turkish Withdrawal from Women's Rights Convention Protest Topic 117: Fuel Price Hikes and Socioeconomic Protests Topic 118: Al Shabaab Attacks on Civilians and Officials Topic 119: Supreme Court Draft Opinion and Roe v. Wade Protests Topic 120: Vigilante Killings of Female Drug Suspects Topic 121: Breast Cancer Awareness and Advocacy Topic 122: Take Back the Night Marches Against Gender-Based Violence Topic 123: Women's Protests Against Government Withdrawal from Domestic Violence Convention Topic 124: Sexual Violence by Armed Groups Topic 125: Saturday Mothers' Protests for Detainee Justice Topic 126: Houthi-Sponsored Demonstrations Against Israeli and Western Actions Topic 127: Abortion Rights Protests in Response to Leaked Supreme Court Draft Topic 128: Vanessa Guillen Vigil and Justice Protests Topic 129: Election Protests and Allegations of Fraud Topic 130: Student Protests for Justice and Police Reform Topic 131: Protests for Health Access for Severely Ill Prisoners Topic 132: Political Protests by Women Wings Against Arrests in Pakistan Topic 133: Protests Against Islamic Government's Hijab Policy Topic 134: International Women's Day Protests for Gender Equality and Labor Rights Topic 135: Protests Against Insecurity and Fulani Pastoralists in Akoko Region Topic 136: Nationwide Flower Demonstrations Against Sexual Violence Acquittals Topic 137: Ethnic Conflict and Violence between Murle and Lou Nuer Communities Topic 138: Women's Activist Rally Against Abortion Restrictions Topic 139: PKK-affiliated Youth Kidnappings for Conscription in Rural Areas Topic 140: Protests by Families and Associations for the Release of Political Detainees and Disappeared Individuals Topic 141: Detention of Women by QSD Forces in Rural Areas for Unknown Reasons Topic 142: ADF Rebel Attacks on Civilians in Villages Topic 143: Pro-Choice Demonstrations Responding to Supreme Court Draft Leak Topic 144: Civil Society Protests Against Gender-Based Violence in June Topic 145: Targeted Arson and Vandalism Against Political Figures' Properties Topic 146: Government Crackdown on Ladies in White Activism Topic 147: Protests Against Violence in Arab Israeli Communities Topic 148: International Women's Day Protests Against Gender Violence and Inequality Topic 149: International Women's Day Demonstrations for Equal Pay and Women's Rights Topic 150: International Day for the Elimination of Violence Against Women Protests Topic 151: NDWO Protests Against Inflation and Corruption Across 77 Districts Topic 152: International Women's Day Protests Against Femicides and for Women's Rights Topic 153: Mahsa Amini Protests and Mourning Observance in Iran Topic 154: Korean Farmers' Advocacy and Press Conferences Topic 155: Women's Protests Against HTS in Countryside Towns Topic 156: Nepali Congress Anti-Government Demonstrations Topic 157: Unsolved Home Invasion Murders of Women Topic 158: Opposition to Legalisation of Alcohol in the State through Women's Sit-in Demonstrations Topic 159: Iranian Women's Rights Protests Against Hijab Enforcement and Police Brutality Topic 160: Kurdish Newroz Celebrations and Demands for Abdullah Ocalan's Release Topic 161: Prison Violence Against Female Political Prisoners Topic 162: Protests Against Quran Burning by Far-Right Politician Topic 163: Baloch Sunni Protests Against Government Violence Topic 164: Dahalo Militia Abductions and Violence in Madagascar Topic 165: Houthi-Sponsored Protest in Solidarity with Palestinians Against Zionist Actions Topic 166: Protests for Justice and Prevention of Child Murders Topic 167: Advocacy for Prostitution Law Amendment Topic 168: Labor Rights Protest by Domestic and Care Workers in the Basque Region Topic 169: Student Protests Against Leaked Supreme Court Draft on Abortion Rights Topic 170: Protests Against Gender-Based Violence on the International Day for the Elimination of Violence Against Women Topic 171: Women's Rally Against Abortion Restrictions Topic 172: International Day for the Elimination of Violence Against Women Demonstrations Topic 173: Alleged Poisonings and Mass Hysteria in Schools Topic 174: National Strike Against Austerity and for Wage Increases Topic 175: Anti-Nuclear Power Plant Protests and Advocacy Topic 176: Protests Against Rising Fuel Prices by Congress Women's Wing Topic 177: International Women's Day Demonstrations on Women's Rights and Inequality Issues Topic 178: International Women's Day Protests Against Femicides Topic 179: WOZA Educational Protests Topic 180: National Protests Against Austerity and For Wage Increases Topic 181: Women's Strike Movement Abortion Protests Topic 182: Sexual Harassment Allegations and Protests Topic 183: Women's Demand for Gender Equality in the Catholic Church Topic 184: Alleged Poisoning of Female Students and Protests Against Government Involvement Topic 185: RLD State-Wide Protests Against Communal Violence and Demand for President's Rule Topic 186: Women's Strike Movement Protests Against Abortion Restrictions Topic 187: Protests Against Macron's Support for Gerard Depardieu Amid Rape Accusations Topic 188: Anti-Far-Right Protests in France Following European Elections Topic 189: Protests Over Mahsa Amini's Death in Police Custody Topic 190: Advocacy for Comprehensive Laws Against Sexual Violence Topic 191: One Billion Rising Protests Against Violence on February 14 Topic 192: Taliban Media Restrictions and Gender-Based Bans Topic 193: Justice Protests for Nirmala Pant's Murder and Rape Case Topic 194: Mob Justice and Lynching of Women Accused of Crimes Topic 195: Prison Visit Suspensions and Protests Topic 196: Protest Against Attack on Jatiya Parishad State President Topic 197: Sexual Violence by SPLA and SPLA-IO Forces in Conflict Zones Topic 198: Meitei Community Protests and Government Response Topic 199: International Women's Day Protests for Gender Equality and Against Gender-Based Violence Topic 200: Gang Violence and Sexual Assault Amid G-9 and G-Pep La Clashes Topic 201: Protest Against Gender-Based Violence Following Children's Disappearance Topic 202: International Women's Day Protest for Gender Equality Topic 203: CODECO-URDPC Attacks on Civilians and Looting Activities Topic 204: Women's Strike Rally Against Abortion Restrictions Topic 205: Houthi-Sponsored Protests in Solidarity with Palestine and Opposition to Israel Topic 206: Houthi-Sponsored Protest Commemorating Saleh Ali Al Samad and Supporting Palestinian Solidarity Topic 207: Detainment and Re-education of Uyghur and Kazakh Women in China Topic 208: Women's Strike Protests Against Abortion Restrictions Topic 209: Protests Over Rape-Murder of PGT Doctor at RG Kar Medical College Topic 210: Advocacy for Missing Persons Detained by the State Topic 211: Women's Rights and Abortion Protests Topic 212: International Women's Day Demonstrations for Equal Rights and Pay Topic 213: Protests Over Benazir Income Support Programme (BISP) Issues Topic 214: Protest Against Agriculture Minister's Derogatory Remarks by BJP Mahila Morcha Topic 215: Environmental and Economic Opposition to Iron Sand Mining Topic 216: Demonstration Against Violence on Women Before International Day Topic 217: Political Violence and Intimidation in Zimbabwean Party Conflicts Topic 218: Protests Commemorating EDSA People Power Revolution and Opposing Current Philippine Leadership Topic 219: Protests Against Violence on International Day for the Elimination of Violence Against Women Topic 220: Women's Protest Against Government Apathy Amid COVID-19 Migrant Crisis Topic 221: Meira Paibi Protests Against NIA Case on Arambai Tenggol Amidst Meitei-Tribal Violence Topic 222: Student Protests Against Rape and Violence in Country on February 23 Topic 223: Female Health Workers' Protest for Pending Salaries Topic 224: Maternity Ward Closures and Protests for Reopening Emergency Services Topic 225: Sudanese Demonstrations Against Military Coup and Arrests Topic 226: Abortion Rights and Religious Protest Dynamics Topic 227: Protests Against AFSPA and Arrest of PREPAK Members in Manipur Topic 228: Advocacy for Voting Rights and Commemoration of January 6th Capitol Riots Anniversary Topic 229: Student Protests Against Government Over Death in Police Custody Topic 230: Arrest and Detention of Women in Syrian and Turkish Countryside for Unknown Reasons Topic 231: Houthi Protests and Zaydi Shiite Commemoration Activities Topic 232: Political Protests and Impeachment of President Yoon Suk-yeol Topic 233: Moroccan Protests Against Israeli Operations and Normalization Topic 234: Drug Trafficking and Torture Incidents Topic 235: Houthi-Sponsored Protest and Solidarity with Palestinians and Lebanese on 10th Anniversary Topic 236: Meitei Community's Demand for Scheduled Tribe Status Topic 237: Women's Protests and Political Demands in Artsakh and Belarus Topic 238: Women's Protests Against Microfinance Loan Policies Topic 239: Protests Against Indian Government's Actions in Kashmir Topic 240: Protests Against Government Pressure on Independent Media Topic 241: Protests Against Rising Violence and Rape in District Topic 242: Medically Assisted Procreation Law Protests and Counter-Protests Topic 243: Protests for Drug Haul Case Reinvestigation Topic 244: Violence by Presumed Muslim Separatists Against Female Civilians in Thai Villages Topic 245: Protests Against Polish Constitutional Court's Abortion Ban Topic 246: Anti-Nuclear Protests and Peace Wave Movement Commemorating Atomic Bombings Topic 247: Women's Rights Demonstrations and Police Intervention Topic 248: Houthi-Sponsored Protest Commemorating Tanumah Massacre and Solidarity with Palestine Topic 249: Midwives' Protest Against Law Affecting Job Security Topic 250: Indigenous Rights Protest for Missing Women at Landfills Topic 251: Women's Rights and Abortion Legalization Protests on International Women's Day Topic 252: Women's Protest Against Abortion Restrictions Topic 253: Women's Rights Protests and 'A Rapist in Your Path' Performances Topic 254: Anti-War Protests Organized by Codepink Topic 255: Military Service Deaths Protest Movement Topic 256: Protests Against Modesty Police Following Death of Young Woman Topic 257: Protests against Morality Police after Mahsa Amini's Death Topic 258: Demonstrations Supporting Rule of Law Against Far-Right Leader's Conviction Topic 259: Nationwide Farmers' Protest Against Farm Laws Topic 260: Women's Rights and Demonstrations in Pakistan Topic 261: Comfort Women Protests at Former Japanese Embassy in Seoul Topic 262: Nationwide 'Bigger than Roe' Protests on Roe v. Wade 50th Anniversary Topic 263: Protests Against Valeri Simeonov over Insulting Remarks to Mothers of Children with Disabilities Topic 264: Enforcement of Religious Dress Codes by Iranian Morality Police
In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Convert only necessary columns to pandas, preserving order
df_plot = pl_df_clean.select(["country", "notes_clean"]).drop_nulls().to_pandas()
# Ensure the length matches the number of topics
assert len(df_plot) == len(topics), "❌ 'topics' length does not match the cleaned documents."
# Assign topics to cleaned documents
df_doc_topics = pd.DataFrame({
"country": df_plot["country"].values,
"topic": topics
})
# Count topics per country
topic_counts = df_doc_topics.groupby(["country", "topic"]).size().reset_index(name="count")
# Relative percentage per country
topic_counts["total_country"] = topic_counts.groupby("country")["count"].transform("sum")
topic_counts["percentage"] = 100 * topic_counts["count"] / topic_counts["total_country"]
# Get topic labels
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
labels_df = pd.DataFrame([
{"topic": topic_id, "label": label_info[0][0].split("\n")[0]}
for topic_id, label_info in openai_topics.items()
])
# Merge with labels
topic_counts["topic"] = topic_counts["topic"].astype(int)
labels_df["topic"] = labels_df["topic"].astype(int)
topic_counts = topic_counts.merge(labels_df, on="topic", how="left")
# Top-N topics per country
top_n = 5
top_topics_per_country = topic_counts.sort_values(["country", "percentage"], ascending=[True, False])\
.groupby("country").head(top_n)
# Limit to only 3 countries
countries_to_plot = top_topics_per_country["country"].drop_duplicates().sort_values().head(6)
top_topics_per_country = top_topics_per_country[top_topics_per_country["country"].isin(countries_to_plot)]
# Plot with Seaborn
g = sns.FacetGrid(
top_topics_per_country,
col="country",
col_wrap=3,
sharey=False,
height=4,
aspect=1.5
)
g.map_dataframe(
sns.barplot,
x="percentage",
y="label",
palette="tab10"
)
g.set_titles(col_template="{col_name}")
g.set_axis_labels("Percentage (%)", "Topic")
for ax in g.axes.flatten():
for label in ax.get_yticklabels():
label.set_rotation(0)
plt.tight_layout()
plt.show()
In [46]:
openai_labels = [
label[0][0].split("\n")[0]
for label in topic_model.get_topics(full=True)["OpenAI"].values()
]
topic_model.set_topic_labels(openai_labels)
topic_model.visualize_documents(titles, reduced_embeddings=reduced_embeddings, hide_annotations=True, hide_document_hover=False, custom_labels=True)
In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Convert required columns from Polars
df_plot = pl_df_clean.select(["event_date", "notes_clean"]).drop_nulls().to_pandas()
df_plot["event_date"] = pd.to_datetime(df_plot["event_date"])
# Validate topic length
assert len(df_plot) == len(topics), "❌ Length of 'topics' does not match cleaned documents."
# Create DataFrame of topics per document
df_doc_topics = pd.DataFrame({
"event_date": df_plot["event_date"].values,
"topic": topics
})
# Add month column
df_doc_topics["month"] = df_doc_topics["event_date"].dt.to_period("M").dt.to_timestamp()
# Group by month and topic
monthly_topic_counts = df_doc_topics.groupby(["month", "topic"]).size().reset_index(name="count")
# Calculate percentage within each month
monthly_topic_counts["monthly_total"] = monthly_topic_counts.groupby("month")["count"].transform("sum")
monthly_topic_counts["percentage"] = 100 * monthly_topic_counts["count"] / monthly_topic_counts["monthly_total"]
# Get topic labels from the model
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
labels_df = pd.DataFrame([
{"topic": topic_id, "label": label_info[0][0].split("\n")[0]}
for topic_id, label_info in openai_topics.items()
])
labels_df["topic"] = labels_df["topic"].astype(int)
monthly_topic_counts["topic"] = monthly_topic_counts["topic"].astype(int)
monthly_topic_counts = monthly_topic_counts.merge(labels_df, on="topic", how="left")
# Select top-N global topics by volume
top_n = 5
top_topics = (
monthly_topic_counts.groupby("topic")["count"]
.sum()
.nlargest(top_n)
.index.tolist()
)
df_top = monthly_topic_counts[monthly_topic_counts["topic"].isin(top_topics)]
# Monthly percentage line plot
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=df_top, x="month", y="percentage", hue="label", marker="o")
plt.legend(title="Topic (OpenAI)", bbox_to_anchor=(1.02, 1), loc="upper left", borderaxespad=0)
plt.title(f"Monthly trend (% of total) of the top {top_n} most frequent topics")
plt.xlabel("Month")
plt.ylabel("Percentage of documents (%)")
plt.xticks(rotation=45)
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()
In [50]:
import itertools
import pandas as pd
# Define colors for the visualization to iterate over
colors = itertools.cycle(['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000'])
color_key = {str(topic): next(colors) for topic in set(topic_model.topics_) if topic != -1}
# Prepare dataframe and ignore outliers
dfo = pd.DataFrame({"x": reduced_embeddings[:, 0], "y": reduced_embeddings[:, 1], "Topic": [str(t) for t in topic_model.topics_]})
dfo["Length"] = [len(doc) for doc in documents]
dfo = dfo.loc[dfo.Topic != "-1"]
dfo = dfo.loc[(dfo.y > -10) & (dfo.y < 10) & (dfo.x < 10) & (dfo.x > -10), :]
dfo["Topic"] = dfo["Topic"].astype("category")
# Get centroids of clusters
mean_df = dfo.groupby("Topic").mean().reset_index()
mean_df.Topic = mean_df.Topic.astype(int)
mean_df = mean_df.sort_values("Topic")
In [52]:
import seaborn as sns
from matplotlib import pyplot as plt
from adjustText import adjust_text
import matplotlib.patheffects as pe
import textwrap
fig = plt.figure(figsize=(20, 20))
sns.scatterplot(
data=dfo,
x='x',
y='y',
hue='Topic',
palette=color_key,
alpha=0.4,
size='Length',
sizes=(10, 200),
legend=False
)
# Annotate top 50 topics
texts, xs, ys = [], [], []
for row in mean_df.iterrows():
topic = row[1]["Topic"]
name = textwrap.fill(topic_model.custom_labels_[int(topic)], 20)
if int(topic) <= 50:
xs.append(row[1]["x"])
ys.append(row[1]["y"])
texts.append(plt.text(row[1]["x"], row[1]["y"], name, size=10, ha="center", color=color_key[str(int(topic))],
path_effects=[pe.withStroke(linewidth=0.5, foreground="black")]
))
# Adjust annotations such that they do not overlap
adjust_text(texts, x=xs, y=ys, time_lim=1, force_text=(0.01, 0.02), force_static=(0.01, 0.02), force_pull=(0.5, 0.5))
plt.axis('off')
plt.legend('', frameon=False)
plt.show()
In [62]:
# Extract required data
df_plot = pl_df_clean.select(["event_date", "country", "notes_clean"]).drop_nulls().to_pandas()
df_plot["event_date"] = pd.to_datetime(df_plot["event_date"])
# Validate length
assert len(df_plot) == len(topics), "❌ Length of topics does not match the documents."
# Create base DataFrame
df_doc_topics = pd.DataFrame({
"event_date": df_plot["event_date"].values,
"country": df_plot["country"].values,
"topic": topics
})
# Add month column
df_doc_topics["month"] = df_doc_topics["event_date"].dt.to_period("M").dt.to_timestamp()
# Group by month, country, and topic
grouped = df_doc_topics.groupby(["month", "country", "topic"]).size().reset_index(name="count")
# Calculate % per month and country
grouped["total"] = grouped.groupby(["month", "country"])["count"].transform("sum")
grouped["percentage"] = 100 * grouped["count"] / grouped["total"]
# Add OpenAI topic labels
openai_topics = topic_model.get_topics(full=True)["OpenAI"]
labels_df = pd.DataFrame([
{"topic": topic_id, "label": label_info[0][0].split("\n")[0]}
for topic_id, label_info in openai_topics.items()
])
labels_df["topic"] = labels_df["topic"].astype(int)
grouped["topic"] = grouped["topic"].astype(int)
grouped = grouped.merge(labels_df, on="topic", how="left")
# Filter top-N agendas per country and month
top_n = 3
top_agendas = grouped.sort_values(["month", "country", "percentage"], ascending=[True, True, False])\
.groupby(["month", "country"]).head(top_n)
top_agendas.head()
Out[62]:
| month | country | topic | count | total | percentage | label | |
|---|---|---|---|---|---|---|---|
| 0 | 1997-01-01 | Niger | -1 | 1 | 1 | 100.0 | Protests Against Gender-Based Violence and Wom... |
| 1 | 1997-01-01 | Rwanda | -1 | 1 | 1 | 100.0 | Protests Against Gender-Based Violence and Wom... |
| 2 | 1997-01-01 | Sierra Leone | -1 | 1 | 1 | 100.0 | Protests Against Gender-Based Violence and Wom... |
| 3 | 1997-02-01 | Algeria | -1 | 1 | 1 | 100.0 | Protests Against Gender-Based Violence and Wom... |
| 4 | 1997-03-01 | Uganda | -1 | 1 | 1 | 100.0 | Protests Against Gender-Based Violence and Wom... |
In [66]:
top_agendas.to_csv("Output/Trained_Agendas.csv", index=False)
In [68]:
# Monthly Classification
In [70]:
!pip install sentence-transformers
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Requirement already satisfied: sentence-transformers in /opt/anaconda3/lib/python3.11/site-packages (5.0.0) Requirement already satisfied: transformers<5.0.0,>=4.41.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.53.2) Requirement already satisfied: tqdm in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.65.0) Requirement already satisfied: torch>=1.11.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (2.2.2) Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.6.1) Requirement already satisfied: scipy in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (1.11.4) Requirement already satisfied: huggingface-hub>=0.20.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (0.33.4) Requirement already satisfied: Pillow in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (10.2.0) Requirement already satisfied: typing_extensions>=4.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from sentence-transformers) (4.14.0) Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (3.13.1) Requirement already satisfied: fsspec>=2023.5.0 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2023.6.0) Requirement already satisfied: packaging>=20.9 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (23.1) Requirement already satisfied: pyyaml>=5.1 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (6.0.1) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (2.32.3) Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /opt/anaconda3/lib/python3.11/site-packages (from huggingface-hub>=0.20.0->sentence-transformers) (1.1.5) Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (1.12) Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1) Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.11/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.3) Requirement already satisfied: numpy>=1.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (1.26.4) Requirement already satisfied: regex!=2019.12.17 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (2023.10.3) Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.21.2) Requirement already satisfied: safetensors>=0.4.3 in /opt/anaconda3/lib/python3.11/site-packages (from transformers<5.0.0,>=4.41.0->sentence-transformers) (0.5.2) Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in /opt/anaconda3/lib/python3.11/site-packages (from scikit-learn->sentence-transformers) (3.5.0) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.11/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (3.10) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.11/site-packages (from requests->huggingface-hub>=0.20.0->sentence-transformers) (2025.4.26) Requirement already satisfied: mpmath>=0.19 in /opt/anaconda3/lib/python3.11/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0)
In [84]:
import pandas as pd
# Load the original file
df_women = pd.read_csv("Input/Jun25.csv")
# Filter rows where 'assoc_actor_1' mentions "Women"
df_filtered = df_women[df_women["assoc_actor_1"].astype(str).str.contains("Women", case=False, na=False)].copy()
# Show the first 5 results
df_filtered.head()
Out[84]:
| event_id_cnty | event_date | year | time_precision | disorder_type | event_type | sub_event_type | actor1 | assoc_actor_1 | inter1 | ... | location | latitude | longitude | geo_precision | source | source_scale | notes | fatalities | tags | timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36 | MAA2233 | 30 June 2025 | 2025 | 1 | Demonstrations | Protests | Peaceful protest | Protesters (Mauritania) | Labor Group (Mauritania); Women (Mauritania) | Protesters | ... | Nouadhibou | 20.9434 | -17.0380 | 1 | Al Akhbar (Mauritania) | National | On 30 June 2025, a large number of women fish ... | 0 | crowd size=large | 1751941686 |
| 50 | MEX106699 | 30 June 2025 | 2025 | 1 | Demonstrations | Protests | Peaceful protest | Protesters (Mexico) | Women (Mexico) | Protesters | ... | Mazatlan | 23.2003 | -106.4222 | 1 | Noroeste | Subnational | On 30 June 2025, in Mazatlan, Sinaloa, almost ... | 0 | crowd size=almost 100 | 1751941687 |
| 78 | TUR45013 | 30 June 2025 | 2025 | 1 | Demonstrations | Protests | Peaceful protest | Protesters (Turkey) | Lawyers (Turkey); Women (Turkey) | Protesters | ... | Elazig | 38.6743 | 39.2232 | 1 | Evrensel | National | On 30 June 2025, women and lawyers gathered in... | 0 | crowd size=no report | 1751941772 |
| 188 | IND188094 | 30 June 2025 | 2025 | 1 | Demonstrations | Protests | Peaceful protest | Protesters (India) | Sonowal Kachari Ethnic Group (India); Students... | Protesters | ... | Dibrugarh | 27.4727 | 94.9121 | 1 | Pratidin Time; Times of India | Subnational-National | On 30 June 2025, All Assam Sonowal Kachari Stu... | 0 | crowd size=massive | 1752014372 |
| 192 | IND188187 | 30 June 2025 | 2025 | 1 | Demonstrations | Protests | Peaceful protest | Protesters (India) | AAP: Aam Aadmi Party; Former Government of Ind... | Protesters | ... | Delhi - Rajouri Garden | 28.6331 | 77.1051 | 1 | Amar Ujala; Deccan Chronicle | National | On 30 June 2025, AAP MLAs, including the AAP D... | 0 | crowd size=no report | 1752014373 |
5 rows × 31 columns
In [86]:
# Apply the topic model to the 'notes' column of filtered data
documents = df_filtered["notes"].astype(str).tolist()
topics, probs = topic_model.transform(documents)
# Add topic results to the filtered DataFrame
df_filtered["topic"] = topics
df_filtered["probability"] = probs
# Get and clean topic labels
raw_labels = topic_model.get_topic_info()[["Topic", "Name"]].set_index("Topic")["Name"].to_dict()
clean_labels = {
topic_id: (
label.split("_", 1)[1] if topic_id != -1 and "_" in label else "Unassigned"
)
for topic_id, label in raw_labels.items()
}
df_filtered["topic_label"] = df_filtered["topic"].map(clean_labels)
Batches: 0%| | 0/16 [00:00<?, ?it/s]
2025-07-16 15:14:45,704 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings. 2025-07-16 15:14:46,236 - BERTopic - Dimensionality - Completed ✓ 2025-07-16 15:14:46,236 - BERTopic - Clustering - Approximating new points with `hdbscan_model` 2025-07-16 15:14:46,261 - BERTopic - Cluster - Completed ✓
In [108]:
# Count how many times each topic appears in the filtered data
topic_counts = df_women_filtered["topic"].value_counts().to_dict()
# Add the count column to each row
df_women_filtered["count"] = df_women_filtered["topic"].map(topic_counts)
In [110]:
# Filter out rows with topic -1 (unassigned)
df_women_filtered = df_filtered[df_filtered["topic"] != -1].copy()
# Count how many times each topic appears
topic_counts = df_women_filtered["topic"].value_counts().to_dict()
df_women_filtered["count"] = df_women_filtered["topic"].map(topic_counts)
# Show the first 5 results
print(df_women_filtered[["country", "topic", "topic_label", "count", "probability"]].sample(5))
country topic topic_label count probability 5096 Yemen 165 houthi_enemy_zionist_stand 6 0.257061 10743 United States 19 gun_moms_control_action 12 0.897034 5074 Turkey 125 ihd_saturday_taksim_weekly 1 0.690430 3055 United States 29 trump_musk_elon_donald 64 0.576834 4673 Nigeria 135 akoko_community_fulani_lga 2 1.000000
In [116]:
# Save cleaned file
df_women_filtered[["country", "topic", "topic_label", "count", "probability"]].to_csv("Output/Jun25Output.csv", index=False)